In [248]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

As we have 2 datasets, we need to work a little bit to merge and selected data from each of them to create our own dataset.

In [249]:
fpGlobal = "../input/covid19-global-forecasting-week-3/train.csv"
gl = pd.read_csv(fpGlobal)

fpEnr = "/kaggle/input/covid19-enriched-dataset-week-2/enriched_covid_19_week_2.csv"
enr = pd.read_csv(fpEnr)

fpPop = "/kaggle/input/covcsd-covid19-countries-statistical-dataset/"
#pop = pd.read_csv(fpPop)
In [250]:
gl.tail()
#train.info()
Out[250]:
Id Province_State Country_Region Date ConfirmedCases Fatalities
23557 32708 NaN Zimbabwe 2020-04-03 9.0 1.0
23558 32709 NaN Zimbabwe 2020-04-04 9.0 1.0
23559 32710 NaN Zimbabwe 2020-04-05 9.0 1.0
23560 32711 NaN Zimbabwe 2020-04-06 10.0 1.0
23561 32712 NaN Zimbabwe 2020-04-07 11.0 2.0
In [251]:
enr.head()
Out[251]:
Id Country_Region Province_State Date ConfirmedCases Fatalities age_0-4 age_5-9 age_10-14 age_15-19 ... smokers_perc density urbanpop hospibed lung femalelung malelung restrictions quarantine schools
0 1 Afghanistan NaN 2020-01-22 0.0 0.0 0.145717 0.139133 0.133376 0.118922 ... 21.389448 60.0 25.0 0.5 37.62 36.31 39.33 0 0 0
1 2 Afghanistan NaN 2020-01-23 0.0 0.0 0.145717 0.139133 0.133376 0.118922 ... 21.389448 60.0 25.0 0.5 37.62 36.31 39.33 0 0 0
2 3 Afghanistan NaN 2020-01-24 0.0 0.0 0.145717 0.139133 0.133376 0.118922 ... 21.389448 60.0 25.0 0.5 37.62 36.31 39.33 0 0 0
3 4 Afghanistan NaN 2020-01-25 0.0 0.0 0.145717 0.139133 0.133376 0.118922 ... 21.389448 60.0 25.0 0.5 37.62 36.31 39.33 0 0 0
4 5 Afghanistan NaN 2020-01-26 0.0 0.0 0.145717 0.139133 0.133376 0.118922 ... 21.389448 60.0 25.0 0.5 37.62 36.31 39.33 0 0 0

5 rows × 38 columns

In [ ]:
enr.columns
enr.loc[:,"Country_Region"].unique() ## 293

Lets see what we have as columns in our dataset:

In [252]:
import os
s = len(fpPop)
af = pd.read_csv("../input/covcsd-covid19-countries-statistical-dataset/Afghanistan_COVID19.csv")
af.head()
print(af.columns)
print(enr.columns)
Index(['Date', 'State', 'Country', 'Cumulative_cases', 'Cumulative_death',
       'Daily_cases', 'Daily_death', 'Latitude', 'Longitude', 'Temperature',
       'Min_temperature', 'Max_temperature', 'Wind_speed', 'Precipitation',
       'Fog_Presence', 'Population', 'Population Density/km', 'Median_Age',
       'Sex Ratio', 'Age%_65+', 'Hospital Beds/1000', 'Available Beds/1000',
       'Confirmed Cases/1000', 'Lung Patients (F)', 'Lung Patients (M)',
       'Median Age (Years)', 'Life Expectancy (M)', 'Life Expectancy (F)',
       'Total_tests_conducted', 'Out_Travels (mill.)', 'In_travels(mill.)',
       'Domestic_Travels (mill.)'],
      dtype='object')
Index(['Id', 'Country_Region', 'Province_State', 'Date', 'ConfirmedCases',
       'Fatalities', 'age_0-4', 'age_5-9', 'age_10-14', 'age_15-19',
       'age_20-24', 'age_25-29', 'age_30-34', 'age_35-39', 'age_40-44',
       'age_45-49', 'age_50-54', 'age_55-59', 'age_60-64', 'age_65-69',
       'age_70-74', 'age_75-79', 'age_80-84', 'age_85-89', 'age_90-94',
       'age_95-99', 'age_100+', 'total_pop', 'smokers_perc', 'density',
       'urbanpop', 'hospibed', 'lung', 'femalelung', 'malelung',
       'restrictions', 'quarantine', 'schools'],
      dtype='object')
In [253]:
af.head()
Out[253]:
Date State Country Cumulative_cases Cumulative_death Daily_cases Daily_death Latitude Longitude Temperature ... Confirmed Cases/1000 Lung Patients (F) Lung Patients (M) Median Age (Years) Life Expectancy (M) Life Expectancy (F) Total_tests_conducted Out_Travels (mill.) In_travels(mill.) Domestic_Travels (mill.)
0 22-01-2020 NaN Afghanistan 0 0 0 0 33 65 5.89 ... 0.0 36.31 39.33 18.4 63.2 63.6 1019 1.561605 Not Reported Not Reported
1 23-01-2020 NaN Afghanistan 0 0 0 0 33 65 5.56 ... 0.0 36.31 39.33 18.4 63.2 63.6 1019 1.561605 Not Reported Not Reported
2 24-01-2020 NaN Afghanistan 0 0 0 0 33 65 4.50 ... 0.0 36.31 39.33 18.4 63.2 63.6 1019 1.561605 Not Reported Not Reported
3 25-01-2020 NaN Afghanistan 0 0 0 0 33 65 7.78 ... 0.0 36.31 39.33 18.4 63.2 63.6 1019 1.561605 Not Reported Not Reported
4 26-01-2020 NaN Afghanistan 0 0 0 0 33 65 6.00 ... 0.0 36.31 39.33 18.4 63.2 63.6 1019 1.561605 Not Reported Not Reported

5 rows × 32 columns

checking the temperature variation for Afghanistan:

In [254]:
af = af.drop_duplicates()

maxt = af.Temperature.max()
mint = af.Temperature.min()
print(f"max = {maxt}, min = {mint}")
af.Temperature.unique()
af["Wind_speed"].unique()
max = 14.78, min = -15.44
Out[254]:
array([ 9.4, 14.9, 10.4,  6.1, 10.8,  3.7,  2.4,  1.9,  4.5,  3.3,  1.7,
        5.7,  7.7,  4.3,  6.7, 10.1,  5.6,  7.2,  4.7,  5.9,  6.5,  5. ,
        6.4,  4. ,  5.8,  6.2,  2.8,  1.4,  3.4,  5.1,  4.6,  6.6,  3.6,
        4.4,  5.3,  3.9,  2.9,  3. ,  9. ,  4.8,  2.7,  7.5,  7.3,  4.9,
        3.2,  6. ])
In [255]:
qu = enr.loc[:,["Country_Region", "restrictions", "quarentine"]]
qu.rename(columns={"Country_Region":"Country"}, inplace =True)
print(qu.head())
print("length qu : ", len(qu))
       Country  restrictions  quarentine
0  Afghanistan             0         NaN
1  Afghanistan             0         NaN
2  Afghanistan             0         NaN
3  Afghanistan             0         NaN
4  Afghanistan             0         NaN
length qu :  18816

Lets first select the data for just one country.

In [257]:
af = af.merge(qu_Af, on="Country", how = "left")
af.tail()
Out[257]:
Date State Country Cumulative_cases Cumulative_death Daily_cases Daily_death Latitude Longitude Temperature ... Lung Patients (M) Median Age (Years) Life Expectancy (M) Life Expectancy (F) Total_tests_conducted Out_Travels (mill.) In_travels(mill.) Domestic_Travels (mill.) restrictions quarentine
4539 01-04-2020 NaN Afghanistan 239 4 43 0 33 65 7.01 ... 39.33 18.4 63.2 63.6 1019 1.561605 Not Reported Not Reported 0 NaN
4540 01-04-2020 NaN Afghanistan 239 4 43 0 33 65 7.01 ... 39.33 18.4 63.2 63.6 1019 1.561605 Not Reported Not Reported 0 NaN
4541 01-04-2020 NaN Afghanistan 239 4 43 0 33 65 7.01 ... 39.33 18.4 63.2 63.6 1019 1.561605 Not Reported Not Reported 0 NaN
4542 01-04-2020 NaN Afghanistan 239 4 43 0 33 65 7.01 ... 39.33 18.4 63.2 63.6 1019 1.561605 Not Reported Not Reported 0 NaN
4543 01-04-2020 NaN Afghanistan 239 4 43 0 33 65 7.01 ... 39.33 18.4 63.2 63.6 1019 1.561605 Not Reported Not Reported 0 NaN

5 rows × 34 columns

In [ ]:
# ang = pd.read_csv("../input/covcsd-covid19-countries-statistical-dataset/Angola_COVID19.csv")
# ang.head()

# new = qu.loc[qu["Country"]=="ahahaha"]
# print(new["Country"])
# len(new["Country"])

Finally, lets grab all the files from our statistical dataset that are also in the enriched dataset, so we will not have missing values.

In [258]:
import re
countriesDf = {}
not_in = []
i = 0
for filename in os.listdir(fpPop):
    if filename != "temperature_data.csv" and filename != "Tanzania.csv":
        #print("our filename: ", filename)
        curr_df =  pd.read_csv("../input/covcsd-covid19-countries-statistical-dataset/"+filename)

        res = re.split('_|.csv',filename)

        new = qu.loc[qu["Country"]==res[0]]
        if len(new["Country"]):
            #print("our new : ", new)
            curr_df = curr_df.merge(new, on="Country", how = "left")
            countriesDf[res[0]] = curr_df
            #print("our curr_df : ", curr_df)
        else:
            print("Dieser Lände ist nicht dort in der ENR")
            not_in.append(filename)
print("Number of countries considered : ", len(countriesDf))
Dieser Lände ist nicht dort in der ENR
Dieser Lände ist nicht dort in der ENR
Dieser Lände ist nicht dort in der ENR
Dieser Lände ist nicht dort in der ENR
Dieser Lände ist nicht dort in der ENR
Dieser Lände ist nicht dort in der ENR
Dieser Lände ist nicht dort in der ENR
Dieser Lände ist nicht dort in der ENR
Dieser Lände ist nicht dort in der ENR
Dieser Lände ist nicht dort in der ENR
Dieser Lände ist nicht dort in der ENR
Dieser Lände ist nicht dort in der ENR
Dieser Lände ist nicht dort in der ENR
Number of countries considered :  77
In [259]:
def heat(curr):
    n = curr.Cumulative_cases.max()
    data = []
    [data.append([curr.Latitude[0],curr.Longitude[0],n,curr.Country[0]]) for i in range(n)]

    m = pd.DataFrame(data, columns = ["Latitude",'Longitude',"Cumulative_cases","Country"])
    return m
        
In [260]:
from datetime import datetime
new = pd.DataFrame()
heat_df = pd.DataFrame()
i = 0
for name in countriesDf.keys():
    #print("country : ", name)
    curr = countriesDf[name].copy()
    curr = curr.drop_duplicates() ###############################################" !!!!!!!!!!!!!!!!!!!"
    #### ######
    '''
    This is only because there is some files that doesn't have the first colunm name. We could manually fix them or add the columns,
    columns, but as this was just for one file, I chose to not consider the file.
    '''
    
    if "Date" not in curr.columns :
        print(f"The country {name} has no Date column!")
        try:
            curr.rename(columns={" Date":"Date"},inplace=True)
            print("We fixed!")
        except:
            print("ich weiss nicht was passiert!")
    ### ######    
    curr['Date'] = pd.to_datetime(curr['Date'])
    curr['Date'] = curr['Date'].dt.strftime('%d-%m-%Y')
    
    date = curr["Date"].apply(lambda x : datetime.strptime(x, "%d-%m-%Y"))
    
    curr["Month"] = date.apply(lambda x : x.month)
    curr["Day"] = date.apply(lambda x : x.day)
    
    curr_is = curr.loc[:,["Country", "Date", "Cumulative_cases",'Cumulative_death','Population Density/km','Latitude','Longitude','Temperature',"Median_Age", "Life Expectancy (M)","Lung Patients (M)"]]
    curr_is['NewDate'] = pd.to_datetime(curr_is['Date']) ## creating "date" timestamp from the "Date" string
    curr_is['NewDate'] = curr_is['NewDate'].dt.strftime('%d-%m-%Y')
    curr_is['log_ConfirmedCases'] = np.log(curr_is.Cumulative_cases + 1)
    curr_is['log_deaths'] = np.log(curr_is.Cumulative_death + 1)
#     if i == 10:
#         break
    new = pd.concat([new,curr_is], ignore_index=True)
    #i += 1
    
    #### To heatmap:
    heat_df = pd.concat([heat_df,heat(curr)], ignore_index=True)    
    
print("size : ", len(new))
print("number countries : ", new.Country.unique())
#new.loc[len(new)/2-10:len(new)/2+10,:]
heat_df.tail()
The country Armenia has no Date column!
We fixed!
size :  6280
number countries :  ['Austria' 'Mozambique' 'Morocco' 'Bahrain' 'Guinea' 'Jamaica' 'Guatemala'
 'Ukraine' 'Mauritius' 'Armenia' 'Egypt' 'France' 'Albania' 'Indonesia'
 'Hungary' 'Djibouti' 'Belarus' 'Kenya' 'Germany' 'Denmark' 'Estonia'
 'Lithuania' 'Georgia' 'Laos' 'Azerbaijan' 'Ethiopia' 'Kyrgyzstan'
 'Lebanon' 'India' 'Kuwait' 'Maldives' 'Algeria' 'Thailand' 'Latvia'
 'Guyana' 'Afghanistan' 'Angola' 'Moldova' 'United Arab Emirates'
 'Ecuador' 'Mali' 'Kazakhstan' 'Greece' 'Luxembourg' 'Argentina' 'Gabon'
 'Madagascar' 'Libya' 'Malaysia' 'Tunisia' 'Malta' 'Bahamas' 'Iran'
 'Eswatini' 'Montenegro' 'Barbados' 'Turkey' 'Mongolia' 'Liberia' 'Uganda'
 'Monaco' 'Gambia' 'Andorra' 'Finland' 'Bangladesh' 'Honduras' 'Israel'
 'Belize' 'Trinidad and Tobago' 'Togo' 'Iraq' 'Italy' 'Iceland' 'Ghana'
 'Belgium' 'Mexico' 'Ireland']
Out[260]:
Latitude Longitude Cumulative_cases Country
324422 53.1424 -7.6921 2910 Ireland
324423 53.1424 -7.6921 2910 Ireland
324424 53.1424 -7.6921 2910 Ireland
324425 53.1424 -7.6921 2910 Ireland
324426 53.1424 -7.6921 2910 Ireland
In [261]:
import matplotlib.pyplot as plt     


import plotly.express as px       
import plotly.offline as py       
import seaborn as sns             
import plotly.graph_objects as go 

from plotly.subplots import make_subplots


import glob                       
In [262]:
py.init_notebook_mode(connected=True)

#Plotting the figure
fig = px.choropleth(new, locations="Country", locationmode='country names', 
                     color="log_ConfirmedCases", hover_name="Country",projection="natural earth" ,#"mercator",
                     animation_frame="NewDate",width=1000, height=800,
                     color_continuous_scale=px.colors.sequential.Viridis,
                     title='COVID-19 Cases Across World')

#Showing the figure
fig.update(layout_coloraxis_showscale=True)
py.offline.iplot(fig)

Hereby, the heatmap for the Confirmed Cases. The heatmap is just another representation of geographical density of some property. In this case, the number of Confirmed Cases for the lastest day considered in our dataset.

In [263]:
import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster

m_5 = folium.Map(location=[42.32,-71.0589], tiles='cartodbpositron', zoom_start=2)

HeatMap(data=heat_df[['Latitude','Longitude']], radius=10).add_to(m_5)

m_5
Out[263]:
In [264]:
df = new.copy()
In [265]:
import plotly.express as px

fig = px.bar(df, x="Country", y="log_ConfirmedCases", color="Country",
  animation_frame="NewDate", animation_group="Country", range_y=[0,12])
fig.show()

Is quite difficult to get a good visualization with all the countries considered. Lets see only for some of them. (You can change the countries as you want)

Only for 4 european countries:

In [266]:
eur = ["Italy","France","Germany","Belgium"]
ex = df[df.Country.isin(eur)]
In [267]:
fig = px.bar(ex, x="Country", y="log_ConfirmedCases", color="Country",
  animation_frame="NewDate", animation_group="Country", range_y=[0,12])
fig.show()

Life Expectancy :

In [268]:
eur = ["Italy","Afghanistan","Ukraine","Argentina","Jamaica","Albania","Thailand","Togo"]
ex = df[df.Country.isin(eur)]
ex.head()
Out[268]:
Country Date Cumulative_cases Cumulative_death Population Density/km Latitude Longitude Temperature Median_Age Life Expectancy (M) Lung Patients (M) NewDate log_ConfirmedCases log_deaths
345 Jamaica 22-01-2020 0 0 273 18.1096 -77.2975 26.39 31.0 72.7 26.44 22-01-2020 0.0 0.0
346 Jamaica 23-01-2020 0 0 273 18.1096 -77.2975 24.06 31.0 72.7 26.44 23-01-2020 0.0 0.0
347 Jamaica 24-01-2020 0 0 273 18.1096 -77.2975 25.17 31.0 72.7 26.44 24-01-2020 0.0 0.0
348 Jamaica 25-01-2020 0 0 273 18.1096 -77.2975 26.44 31.0 72.7 26.44 25-01-2020 0.0 0.0
349 Jamaica 26-01-2020 0 0 273 18.1096 -77.2975 25.83 31.0 72.7 26.44 26-01-2020 0.0 0.0

Does the temperature play an important role for the virus dissemination? The size of the bubles is the number of Confirmed Cases (this is why it increases over the time)

In [269]:
px.scatter(ex, x="Temperature", y="Life Expectancy (M)", animation_frame="NewDate", animation_group="Country",
           size="log_ConfirmedCases", color="Country", hover_name="Country",
           log_x=False, size_max=55, range_x=[-8,35], range_y=[40,90])